Allow shifting instead of masks if the types are legal. #162732
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: AZero13 (AZero13)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/162732.diff

3 Files Affected:
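For context, a minimal IR sketch (not part of the patch, just mirroring the bzhi tests below) of the variable-mask pattern that shouldFoldMaskToVariableShiftPair gates, and the equivalent shift pair the new default prefers whenever SHL is legal for the type:

; Mask form: build a mask of %numlowbits low ones and AND it in.
define i32 @mask_form(i32 %val, i32 %numlowbits) {
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits
  %masked = and i32 %mask, %val
  ret i32 %masked
}

; Shift-pair form: push the unwanted high bits out, then shift zeros back in.
define i32 @shift_pair_form(i32 %val, i32 %numlowbits) {
  %numhighbits = sub i32 32, %numlowbits
  %hi = shl i32 %val, %numhighbits
  %lo = lshr i32 %hi, %numhighbits
  ret i32 %lo
}

The shift pair reuses %numhighbits directly instead of materializing an all-ones mask; whether that is actually a win on AMDGPU is questioned in the review below.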
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 88691b931a8d8..9865143edbdfb 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -839,8 +839,11 @@ class LLVM_ABI TargetLoweringBase {
/// Return true if the variant with 2 variable shifts is preferred.
/// Return false if there is no preference.
virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const {
- // By default, let's assume that no one prefers shifts.
- return false;
+ // By default, let's assume that no one prefers shifts for vectors
+ EVT VT = X.getValueType();
+
+ // Prefer shifts for legal types
+ return isOperationLegal(ISD::SHL, VT);
}
/// Return true if it is profitable to fold a pair of shifts into a mask.
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 89bd5f1dc7969..59bc1dd019de3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -103,16 +103,16 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_c0:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT: v_and_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
%mask = lshr i32 -1, %numhighbits
@@ -121,12 +121,23 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
}
define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0_clamp:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 31, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c0_clamp:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 31, v1
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c0_clamp:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 31, v1
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
%low5bits = and i32 %numlowbits, 31
%numhighbits = sub i32 32, %low5bits
%mask = lshr i32 -1, %numhighbits
@@ -139,16 +150,16 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_c1_indexzext:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v1, 32, v1
-; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT: v_and_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i8 32, %numlowbits
%sh_prom = zext i8 %numhighbits to i32
@@ -162,16 +173,16 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_c4_commutative:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT: v_and_b32_e32 v0, v0, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
%mask = lshr i32 -1, %numhighbits
diff --git a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
index 5b21a3652d11c..1055fc101a6c5 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
@@ -266,7 +266,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -279,9 +279,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
-; EG-NEXT: -1(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.X, PV.W, KC0[2].Y,
+; EG-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
+; EG-NEXT: LSHR T0.X, PV.W, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
@@ -289,7 +288,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -302,9 +301,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR * T0.W, literal.x, PV.W,
-; CM-NEXT: -1(nan), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T0.X, PV.W, KC0[2].Y,
+; CM-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
+; CM-NEXT: LSHR * T0.X, PV.W, T0.W,
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%numhighbits = sub i8 32, %numlowbits
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 31, v1
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
Regression, broke a BFE pattern?
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
This is a neutral change at best
Why? I don't see any tests that show a benefit. I guess it could conceivably reduce register pressure, since the mask has to be materialized in a register, but that would only help if the shift amount has more uses, and there aren't any test cases for that.
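For illustration only, a hypothetical test case along those lines (not in this PR), where the shift amount feeds a second instruction, so the mask form would keep both %numhighbits and the materialized mask live:

; Hypothetical: %numhighbits has a second use, so the shift-pair form could
; in principle drop the register that holds the all-ones mask.
define i32 @bzhi32_shared_amount(i32 %val, i32 %other, i32 %numlowbits) {
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits
  %masked = and i32 %mask, %val
  %shifted = lshr i32 %other, %numhighbits   ; second use of %numhighbits
  %r = xor i32 %masked, %shifted
  ret i32 %r
}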
No description provided.